This notebook accompanies the manuscript titled "Parent Reflections on ABA: Analysis of 5,450 Open-Ended Responses," submitted to Behavior Analysis in Practice. The notebook is organized with the common markdown section headings outlined below; readers using Google Colab will find these section titles hyperlinked when viewing in a browser.
# System stuff
import warnings
warnings.filterwarnings('ignore')
# Data manipulation
import pandas as pd
import numpy as np
import string
import re
from collections import Counter
import random
import collections
# Data analysis
from scipy import stats
from scipy.stats import skew
from scipy.stats import skewtest
# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors as mcolors
palette='gist_earth_r'
# NLP Packages
from sklearn.impute import KNNImputer
try:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
try:
import spacy
except:
!pip install spacy
import spacy
from spacy.lang.en import English
parser = English()
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.util import ngrams
from nltk.corpus import stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
try:
import gensim
except:
!pip install gensim
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
try:
import pyLDAvis
except:
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models
import pickle
# Set directory to Google Drive, if desired
from google.colab import drive
drive.mount('/content/gdrive')
%cd './gdrive/[filepath]/' # replace text in brackets with filepath to where you store this script
%cd './gdrive/My Drive/Patient-Satisfaction-NLP/Scripts and Notebooks/' # replace text in brackets with filepath to where you store this script
def barplot(data, x, y, figsize, xlabel, title, ylabel, rotation=False):
    """
    Draw one horizontal black barplot (the Figure 4 style in the manuscript).

    Each call generates a single plot. Parameter values follow standard
    matplotlib.pyplot and seaborn parameter labeling conventions; set
    rotation=True to rotate the x tick labels 45 degrees.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.barplot(x=x, y=y, data=data, color='black')
    # BUG FIX: the xlabel argument was previously ignored (hard-coded to '').
    plt.xlabel(xlabel, fontsize=30, labelpad=16)
    plt.yticks(fontsize=50)
    if rotation:
        plt.xticks(fontsize=50, rotation=45)
    else:
        plt.xticks(fontsize=50)
    plt.ylabel(ylabel, fontsize=35, labelpad=16)
    plt.title(title, fontsize=70, pad=40)
    # Drop the top/right frame lines for a cleaner publication look
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.show()
def histplot(x_col, data, bins='auto', x_label=None, x_rot=0,
             x_ticks=None, x_tick_lab=None, xmin=1, xmax=10,
             y_label=None, y_rot=0, y_ticks=None,
             y_tick_lab=None, y_min=0, y_max=None,
             title=None, save_name=None, cum=False,
             stat='probability', figsize=(10, 7)):
    """
    Draw one manuscript-style histogram (Figure 3) of `x_col` from `data`.

    Each call produces a single histogram. Parameter names mirror the
    standard matplotlib.pyplot / seaborn conventions. The figure is written
    to ../figures/<save_name>.png before being shown.
    """
    plt.figure(figsize=figsize)
    ax = sns.histplot(data=data, x=x_col, bins=bins, cumulative=cum, stat=stat)
    # x axis styling
    plt.xlabel(x_label, fontsize=30, labelpad=24)
    plt.xticks(ticks=x_ticks, labels=x_tick_lab, fontsize=22, rotation=x_rot)
    plt.xlim(xmin, xmax)
    # y axis styling
    plt.ylabel(y_label, fontsize=30, labelpad=125, rotation=y_rot)
    plt.yticks(ticks=y_ticks, labels=y_tick_lab, fontsize=22)
    plt.ylim(y_min, y_max)
    plt.title(label=title, fontsize=30, pad=40)
    # Hide the top/right frame lines
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    plt.savefig(f"../figures/{save_name}.png", bbox_inches='tight')
    plt.show()
def line_plot(df, col, save_name):
    """
    Plot `col` against the 'topics' column of `df` as a marked line chart.

    Used in the "LDA Topic Modeling" section to choose the number of topics
    that best balances complexity and coherence. The figure is saved to
    ../figures/<save_name>.png and then shown.
    """
    plt.figure(figsize=(10, 7))
    sns.lineplot(data=df, x='topics', y=col, marker='o',
                 size=15, color='k', legend=False)
    plt.xlabel("Topics", fontsize=24, labelpad=20)
    plt.xticks(fontsize=18)
    plt.ylabel(col, fontsize=24, labelpad=20)
    plt.yticks(fontsize=18)
    plt.savefig(f'../figures/{save_name}.png', bbox_inches='tight')
    plt.show()
def remove_more_punct(text):
    """
    Lowercase `text`, strip punctuation, and collapse whitespace.

    Parameters
    ----------
    text : str
        String containing text we want to lowercase and strip of punctuation.

    Returns
    -------
    str
        The cleaned string with single spaces between words.
        (BUG FIX: the original docstring claimed a tokenized list was
        returned; the function has always returned a plain string.)
    """
    punct = set(string.punctuation)
    text = text.lower()
    text = "".join(c for c in text if c not in punct)
    # Strip typographic quotes and symbols the ASCII set above misses
    text = re.sub(r"""[()\’°""#/@;¢€:£<“>{}«®`©”+=~‘|.!?,]""", "", text)
    # NOTE(review): '/' is already removed by the steps above, so this pattern
    # can never match; retained for fidelity with the published analysis.
    text = re.sub(r'/[^a-zA-Z]', "", text)
    return ' '.join(text.split())
def ngram_analysis(df, ngram_col, num_n=2, num_of_terms_to_display=15, title='Bigrams', figsize=(20,15)):
    """
    Count pre-computed ngram tokens and plot the most frequent terms.

    Parameters
    ----------
    df : dataframe
        Dataframe containing the ngram column to analyze.
    ngram_col : str
        Name of the column (each cell a list of ngram strings) ready for analysis.
    num_n : int
        The ngram order (e.g., 2=bigram, 3=trigram). NOTE(review): the ngrams
        are built upstream, so this parameter does not change the computation;
        it is kept only for interface compatibility.
    num_of_terms_to_display : int
        The number of most frequent terms to display in the figure.
    title : str
        Title to display on the figure.
    figsize : tuple
        (Width, Height) of the figures that will be plotted.

    Returns
    -------
    None. Shows one barplot of raw counts for the top N terms and one barplot
    of each term's percentage of all ngram occurrences.
    """
    ngram_lists = df[ngram_col].tolist()
    ng_count = collections.Counter([x for sublist in ngram_lists for x in sublist])
    ngram_df = pd.DataFrame.from_dict(ng_count, orient='index').sort_values(by=[0], ascending=False).reset_index(drop=False)
    ngram_df.columns = ['index', 'count']
    # BUG FIX: percent was previously computed against the number of *unique*
    # ngrams (len(ngram_df)); a term's share of the corpus is its count over
    # the total number of ngram occurrences.
    ngram_df['percent'] = 100*round(ngram_df['count']/ngram_df['count'].sum(), 4)
    print('\n\nLength: ', len(ngram_df))
    # BUG FIX: the preview length was hard-coded to 15 regardless of the parameter.
    print(ngram_df.iloc[:num_of_terms_to_display, :])
    ngram_df['ngram'] = ngram_df['index'].apply(lambda x: remove_more_punct(str(x)))
    barplot(data=ngram_df[:num_of_terms_to_display], x='count', y='ngram', figsize=figsize, xlabel='Count',
            title=title, ylabel='')
    # Second plot shows percentages, so label it accordingly (was 'Count')
    barplot(data=ngram_df[:num_of_terms_to_display], x='percent', y='ngram', figsize=figsize, xlabel='Percent',
            title=title, ylabel='', rotation=True)
def lda_model_grid(df, col):
    """
    Grid-search the number of LDA topics (2-10) for the given text column.

    Parameters
    ----------
    df : dataframe
        Dataframe with the column that contains the pre-processed text you want
        to conduct grid search on for LDA topic modeling.
    col : str
        Name of the column containing the pre-processed (tokenized) text.

    Returns
    -------
    Dataframe with one row per candidate topic count, containing the
    complexity score (gensim's log perplexity) and the c_v coherence score.
    """
    # BUG FIX: the original loop appended a throwaway generator object AND the
    # token list for every row, feeding each document to the dictionary twice.
    df_text = [df[col][i] for i in range(len(df))]
    df_dict = corpora.Dictionary(df_text)
    df_corpus = [df_dict.doc2bow(text) for text in df[col]]
    # Fit a model per candidate topic count and record the scores
    topics = []
    complexity = []
    coherence = []
    warnings.filterwarnings('ignore')  # hoisted: no need to re-apply each iteration
    for i in range(2, 11):
        print(f"Checking model with {i} topics.")
        # Build model (fixed random_state for reproducibility)
        lda_model = gensim.models.ldamodel.LdaModel(corpus=df_corpus,
                                                    id2word=df_dict,
                                                    num_topics=i,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=1000,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        # Compute complexity (log perplexity) and c_v coherence
        cmplx = lda_model.log_perplexity(df_corpus)
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=df[col],
                                             dictionary=df_dict,
                                             coherence='c_v')
        coh = coherence_model_lda.get_coherence()
        topics.append(i)
        complexity.append(cmplx)
        coherence.append(coh)
    # Collect the grid-search results as a df
    df_df = pd.DataFrame({'topics': topics,
                          'complexity': complexity,
                          'coherence': coherence})
    return df_df
def lda_one_topic(df, col, topics, savename, grid_cols, fig_height=5):
    """
    Fit one final LDA topic model and produce its diagnostic visuals.

    Parameters
    ----------
    df : dataframe
        Dataframe with the column that contains the pre-processed text you want
        to develop a final model from via LDA topic modeling.
    col : str
        Name of the column containing the pre-processed (tokenized) text.
    topics : int
        The number of topics to build the model around.
    savename : str
        The name of the topic model that will be passed to all saved items.
    grid_cols : int
        The number of columns that should be displayed in the multiplot images
        generated from this function.
    fig_height : int
        The height of the individual plots within the word-count/weight subplots.

    Returns
    -------
    Pandas dataframe containing the words with corresponding weights for the
    derived topic model.

    NB: Additional items are set to save to local folders when this function is
    run. Comment those lines out if you do not want to keep them. Saved items:
        - .gensim model
        - .csv of topic words and weights
        - .html file resulting from pyLDAvis
    """
    # BUG FIX: the original loop appended a throwaway generator object AND the
    # token list for every row, duplicating each document in the dictionary input.
    df_text = [df[col][i] for i in range(len(df))]
    df_dict = corpora.Dictionary(df_text)
    df_corpus = [df_dict.doc2bow(text) for text in df[col]]
    # Build model (fixed random_state for reproducibility)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=df_corpus,
                                                id2word=df_dict,
                                                num_topics=topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=1000,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    lda_model.save(f'../data/07_model_output/{savename}.gensim')
    all_topics = lda_model.print_topics(num_words=20)
    all_topics = pd.DataFrame(all_topics)
    all_topics.to_csv(f'../data/07_model_output/topic_words_{savename}.csv')
    # Visualize via facet grid: top 10 words per topic
    n_words = 10
    topic_words = pd.DataFrame({})
    for i, topic in enumerate(lda_model.get_topics()):
        top_feature_ids = topic.argsort()[-n_words:][::-1]
        feature_values = topic[top_feature_ids]
        words = [df_dict[term_id] for term_id in top_feature_ids]  # avoid shadowing builtin `id`
        topic_df = pd.DataFrame({'value': feature_values, 'word': words, 'topic': i})
        topic_words = pd.concat([topic_words, topic_df], ignore_index=True)
    # NOTE(review): FacetGrid height is hard-coded to 5; fig_height only applies
    # to the word-count/weight subplots below -- confirm this is intended.
    g = sns.FacetGrid(topic_words, col="topic", col_wrap=grid_cols*2,
                      sharey=False, height=5, aspect=0.65)
    g.map(plt.barh, "word", "value")
    plt.ylabel("")
    plt.show()
    # Visualize via barplot: word counts in the corpus vs. topic weights
    topics_ = lda_model.show_topics(formatted=False)
    data_flat = [w for w_list in df[col] for w in w_list]
    counter = Counter(data_flat)
    out = []
    for i, topic in topics_:
        for word, weight in topic:
            out.append([word, i, weight, counter[word]])
    temp_df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
    # Plot Word Count and Weights of Topic Keywords
    fig, axes = plt.subplots(round((topics+1)/2), grid_cols, figsize=(grid_cols*8, round((topics+1)/2)*fig_height), sharey=False, dpi=160)
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    for i, ax in enumerate(axes.flatten()):
        ax.bar(x='word', height="word_count", data=temp_df.loc[temp_df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
        ax_twin = ax.twinx()
        ax_twin.bar(x='word', height="importance", data=temp_df.loc[temp_df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
        ax.set_ylabel('Frequency', color=cols[i], fontsize=20)
        # Shared y limits so panels are visually comparable
        wc_height = temp_df['word_count'].max() + int(0.1*(temp_df['word_count'].max()))
        we_height = temp_df['importance'].max() + 0.01
        ax_twin.set_ylim(0, we_height); ax.set_ylim(0, wc_height)
        ax.set_title('Topic: ' + str(i+1), color=cols[i], fontsize=30)
        ax.tick_params(axis='y', left=False)
        ax.set_xticklabels(temp_df.loc[temp_df.topic_id==i, 'word'], rotation=45, horizontalalignment='right', fontsize=20)
        ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
    fig.tight_layout(w_pad=2)
    plt.show()
    # Get per-document topic weights
    topic_weights = []
    for i, row_list in enumerate(lda_model[df_corpus]):
        topic_weights.append([w for _, w in row_list[0]])
    # Array of topic weights
    arr = pd.DataFrame(topic_weights).fillna(0).values
    # Keep the well separated points (optional)
    arr = arr[np.amax(arr, axis=1) > 0.35]
    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)
    # tSNE Dimension Reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    # Plot the Topic Clusters using Bokeh
    output_notebook()
    n_topics = topics
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
                  plot_width=900, plot_height=700)
    plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
    show(plot)
    # Visualize using pyLDAvis
    pyLDAvis.enable_notebook()
    lda_display = pyLDAvis.gensim_models.prepare(lda_model,
                                                 df_corpus,
                                                 df_dict,
                                                 sort_topics=False)
    pyLDAvis.display(lda_display)
    pyLDAvis.save_html(lda_display, f'../figures/lda_vis_{savename}.html')
    return all_topics
*** WARNING: This notebook is not automatically attached to a data source. Running the cells below without first connecting the expected data files will raise errors and clear the rendered visuals.
# Read in the raw data
all_data = pd.read_csv('../data/01_raw/all_raw_text.csv')
# Drop the stray index column written by a previous to_csv round-trip
all_data = all_data.drop(['Unnamed: 0'], axis=1)
all_data[::910] # view every 910th row (the old comment said 800th)
# Score every open-text response with the VADER sentiment model
analyzer = SentimentIntensityAnalyzer()
vader_scores = [analyzer.polarity_scores(text) for text in all_data['feedback']]
# Unpack the four VADER components into their own columns
all_data['Positive Sentiment Score'] = [s['pos'] for s in vader_scores]
all_data['Neutral Sentiment Score'] = [s['neu'] for s in vader_scores]
all_data['Negative Sentiment Score'] = [s['neg'] for s in vader_scores]
all_data['Overall Sentiment Score'] = [s['compound'] for s in vader_scores]
# Take a look
all_data[::910]
# Describe
all_data.describe()
# Proportion of responses counted as positive, negative, and neutral
n_total = len(all_data)
overall = all_data['Overall Sentiment Score']
print("Proportion Positive: ", round(len(all_data[overall >= 0.50]) / n_total, 2))
print("Proportion Negative: ", round(len(all_data[overall <= -0.50]) / n_total, 2))
# BUG FIX: the neutral band was previously [0, 0.5], which silently dropped
# responses scored in (-0.5, 0) and double-counted responses at exactly 0.5.
# The three bins now partition the score range.
neutral = all_data[(overall > -0.50) & (overall < 0.50)]
print("Proportion Neutral: ", round(len(neutral) / n_total, 2))
# (Removed a stray notebook-cell expression `26/41` left over from a manual check.)
# Statistical tests of skewness for each sentiment distribution
for score_col in ['Positive Sentiment Score', 'Negative Sentiment Score',
                  'Neutral Sentiment Score', 'Overall Sentiment Score']:
    series = all_data[score_col]
    print(score_col, skew(series), '\n', skewtest(series), '\n')
# 2x2 grid of histograms, one per sentiment-score distribution
fig, axs = plt.subplots(2, 2, figsize=(15, 10), sharey=False, sharex=False)
panels = [
    ('Positive Sentiment Score', axs[0, 0], 0.25, 0.85, 'Positive Sentiment Scores'),
    ('Neutral Sentiment Score', axs[0, 1], 0.6, 0.85, 'Neutral Sentiment Scores'),
    ('Negative Sentiment Score', axs[1, 0], 0.25, 0.425, 'Negative Sentiment Scores'),
    ('Overall Sentiment Score', axs[1, 1], 0.6, 0.425, 'Overall Sentiment Scores'),
]
for col, ax, text_x, text_y, panel_title in panels:
    sns.histplot(x=all_data[col], ax=ax, stat='percent')
    # Per-axis labels are cleared; shared labels are added below
    ax.set_xlabel('')
    ax.set_ylabel('')
    fig.text(text_x, text_y, panel_title)
# Shared axis labels for the whole grid
fig.text(0.35, 0.0, 'Sentiment Score', fontsize=40)
fig.text(0.04, 0.15, 'Percentage of Respondents', fontsize=40, rotation='vertical')
# Save it
plt.savefig(fname='../figures/hist_vader.png', bbox_inches='tight')
# Show it
plt.show()
# 2x2 grid of violin plots, one per sentiment-score distribution
# BUG FIX: sns.violinplot() has no `stat` parameter -- 'percentage' was a
# copy-paste from the histogram cell and either raises a TypeError or is
# silently invalid depending on the seaborn version. Removed.
fig, axs = plt.subplots(2, 2, figsize=(15, 10), sharey=False, sharex=False)
panels = [
    ('Positive Sentiment Score', axs[0, 0], 0.25, 0.85, 'Positive Sentiment Scores'),
    ('Neutral Sentiment Score', axs[0, 1], 0.6, 0.85, 'Neutral Sentiment Scores'),
    ('Negative Sentiment Score', axs[1, 0], 0.25, 0.425, 'Negative Sentiment Scores'),
    ('Overall Sentiment Score', axs[1, 1], 0.6, 0.425, 'Overall Sentiment Scores'),
]
for col, ax, text_x, text_y, panel_title in panels:
    sns.violinplot(x=all_data[col], ax=ax)
    # Per-axis labels are cleared; shared labels are added below
    ax.set_xlabel('')
    ax.set_ylabel('')
    fig.text(text_x, text_y, panel_title)
# Shared axis labels for the whole grid
fig.text(0.35, 0.0, 'Sentiment Score', fontsize=40)
fig.text(0.04, 0.15, 'Percentage of Respondents', fontsize=40, rotation='vertical')
# Save it
plt.savefig(fname='../figures/violin_vader.png', bbox_inches='tight')
# Show it
plt.show()
# Make sure everything in the col is showing as a string
all_data['feedback'] = all_data['feedback'].astype(str)
# Lower case all text
all_data['feedback_clean'] = all_data['feedback'].str.lower()
all_data[::910]
# Remove stopwords. Use a set: the original list gave O(n) membership tests
# per token. The extra entries are dataset-specific names/terms.
# NOTE(review): the multi-word entry 'nov 06 2019' can never match a single
# split() token -- confirm whether it was meant to be handled elsewhere.
stop_words = set(stopwords.words('english') + ['albert', 'centria', 'lauren', 'julien',
                                               'sonnenberg', 'kailee', 'ahss', 'moments',
                                               'bca', 'nov 06 2019', 'evan'])
all_data['feedback_clean'] = all_data['feedback_clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))
all_data[::910]
# Normalize word variants the nltk algorithms don't catch out-of-the-box,
# then strip any remaining ASCII punctuation.
# NOTE(review): these are plain substring substitutions applied in order, so
# e.g. 'son' -> 'child' also rewrites words that merely contain "son"
# (e.g. "season", "person") -- confirm this is acceptable for the corpus.
# (Hoisted the replacement table and translation table out of the loop; the
# original also built an unused `save_list` every iteration -- removed.)
replacements = [
    ('team.', 'team'), ("son's", 'son'), ('family.', 'family'),
    ('getting', 'get'), ('receiving', 'receive'), ('training', 'train'),
    ('there.', 'there'), ('program.', 'program'), ('months.', 'month'),
    ('provider.', 'provider'), ('son.', 'son'), ('sons', 'son'),
    ('therapists.', 'therapist'), ('worked', 'work'),
    ('therapists', 'therapist'), ('son', 'child'), ('daughter', 'child'),
    ('moments', ''),
]
punct_table = str.maketrans('', '', string.punctuation)
fdbk_cln = []
for raw in all_data['feedback_clean']:
    cleaned = raw
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    # One C-level pass instead of ~32 chained .replace() calls
    fdbk_cln.append(cleaned.translate(punct_table))
# Add clean data to the all_data df
all_data['feedback_clean'] = fdbk_cln
all_data[::910]
# Remove the top 10% most common words across the corpus
# BUG FIX (1): responses were previously concatenated with no separator, fusing
# the last word of one response to the first word of the next and corrupting
# the frequency counts. Join with spaces instead.
corpus_text = ' '.join(fdbk_cln)
all_words = nltk.tokenize.word_tokenize(corpus_text)
all_word_dist = nltk.FreqDist(w.lower() for w in all_words)
cutoff = int(len(all_word_dist) * 0.1)
# BUG FIX (2): FreqDist.most_common() returns (word, count) tuples, so the
# old membership test `word not in most_common` never matched a bare word --
# the intended top-10% trim was a no-op (only 've'/'ll' were ever removed).
# Build a set of the words themselves.
common_words = {w for w, _ in all_word_dist.most_common(cutoff)}
common_words.update(['ve', 'll'])  # tokenizer artifacts from contractions
all_data['feedback_trimmed'] = all_data['feedback_clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in common_words))
# Tokenize text
all_data['feedback_trimmed'] = all_data['feedback_trimmed'].apply(str.split)
all_data[::910]
# Build bigram and trigram token lists for every open-text response
bigram_lists = []
trigram_lists = []
for tokens in all_data['feedback_trimmed']:
    # Fall back to an empty list if ngram generation fails for a response
    try:
        bigram_lists.append([' '.join(gram) for gram in ngrams(tokens, 2)])
    except:
        bigram_lists.append([])
    try:
        trigram_lists.append([' '.join(gram) for gram in ngrams(tokens, 3)])
    except:
        trigram_lists.append([])
# Add lists to our working dataframe
all_data['feedback_bigram'] = bigram_lists
all_data['feedback_trigram'] = trigram_lists
# Save it
all_data.to_csv('../data/04_feature/cleaned_text_with_features.csv')
# View it
all_data[::910]
# Slim working frame containing only the columns needed downstream
df = pd.DataFrame({
    'feedback': all_data['feedback_trimmed'],
    'bigrams': all_data['feedback_bigram'],
    'trigrams': all_data['feedback_trigram'],
    'sentiment': all_data['Overall Sentiment Score'],
})
# Drop rows whose token list became empty during cleaning
df = df[df['feedback'].map(len) > 0].reset_index(drop=True)
# Make sure the sentiment col is numeric
df['sentiment'] = df['sentiment'].astype(float)
df[::875]
# Drop a handful of location-specific trigrams that otherwise dominate the counts
remove_list = {'nov 06 2019', 'south suburbs needs', 'side chicago south', 'south side chicago',
               'suburbs needs open', 'center south side', 'need center south', 'something south partner',
               'south partner existing', 'chicago south suburbs'}
df['trigram_clean'] = [[gram for gram in row if gram not in remove_list]
                       for row in df['trigrams']]
df[::875]
# Split into clearly-positive and clearly-negative response subsets
pos_df = df.loc[df['sentiment'] >= 0.50].reset_index(drop=True)
neg_df = df.loc[df['sentiment'] <= -0.50].reset_index(drop=True)
# Uni-, bi-, and trigram frequency plots for the overall, positive, and
# negative response sets (same nine calls as before, expressed as a loop)
subsets = [('Overall', df), ('Positive', pos_df), ('Negative', neg_df)]
gram_specs = [('feedback', 'Bag-of-Words', (20, 15)),
              ('bigrams', 'Bigrams', (22, 15)),
              ('trigram_clean', 'Trigrams', (26, 15))]
for gram_col, gram_label, fig_size in gram_specs:
    for subset_label, frame in subsets:
        ngram_analysis(df=frame, ngram_col=gram_col, num_n=1,
                       title=f'{gram_label}: {subset_label}', figsize=fig_size)
# All responses: unigram LDA grid search
all_lda = lda_model_grid(df=df, col='feedback')
line_plot(df=all_lda, col='complexity', save_name='complexity_all')
line_plot(df=all_lda, col='coherence', save_name='coherence_all')
# All responses: final unigram model (6 topics per the grid search)
all_topics = lda_one_topic(df=df, col='feedback', topics=6,
                           savename='all_responses', grid_cols=2)
# All responses: bigram LDA grid search
all_lda = lda_model_grid(df=df, col='bigrams')
# BUG FIX: these figures previously reused 'complexity_all'/'coherence_all',
# overwriting the unigram grid-search plots saved above.
line_plot(df=all_lda, col='complexity', save_name='complexity_all_bigrams')
line_plot(df=all_lda, col='coherence', save_name='coherence_all_bigrams')
# All responses: bigram single model
all_topics = lda_one_topic(df=df, col='bigrams', topics=6,
                           savename='all_bigrams', grid_cols=2)
# All responses: trigram LDA grid search
all_lda = lda_model_grid(df=df, col='trigrams')
line_plot(df=all_lda, col='complexity', save_name='complexity_all_trigrams')
line_plot(df=all_lda, col='coherence', save_name='coherence_all_trigrams')
# All responses: trigram single model
all_topics = lda_one_topic(df=df, col='trigrams', topics=4,
                           savename='all_trigrams', grid_cols=2)
# Positive responses: unigram LDA grid search
pos_20_lda = lda_model_grid(df=pos_df, col='feedback')
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20')
# Positive responses: final unigram model
p20_topics = lda_one_topic(df=pos_df, col='feedback', topics=5,
                           savename='positive_all', grid_cols=2, fig_height=7.5)
# Positive responses: bigram LDA grid search
pos_20_lda = lda_model_grid(df=pos_df, col='bigrams')
# BUG FIX: these figures previously reused the unigram save names, overwriting
# the grid-search plots saved above.
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20_bigrams')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20_bigrams')
# Positive responses: bigram single model
p20_topics = lda_one_topic(df=pos_df, col='bigrams', topics=7,
                           savename='positive_bigrams', grid_cols=2, fig_height=6)
# Positive responses: trigram LDA grid search
pos_20_lda = lda_model_grid(df=pos_df, col='trigrams')
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20_trigrams')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20_trigrams')
# Positive responses: trigram single model
p20_topics = lda_one_topic(df=pos_df, col='trigrams', topics=2,
                           savename='positive_trigrams', grid_cols=2)
# Negative responses: unigram LDA grid search
neg_20_lda = lda_model_grid(df=neg_df, col='feedback')
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20')
# Negative responses: final unigram model
# BUG FIX: savename was 'negative_bigrams', colliding with (and being
# overwritten by) the true bigram model saved below.
p21_topics = lda_one_topic(df=neg_df, col='feedback', topics=4,
                           savename='negative_all', grid_cols=2)
# Negative responses: bigram LDA grid search
neg_20_lda = lda_model_grid(df=neg_df, col='bigrams')
# BUG FIX: these figures previously reused the unigram save names, overwriting
# the grid-search plots saved above.
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20_bigrams')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20_bigrams')
# Negative responses: bigram single model
p21_topics = lda_one_topic(df=neg_df, col='bigrams', topics=4,
                           savename='negative_bigrams', grid_cols=2, fig_height=6)
# Negative responses: trigram LDA grid search
neg_20_lda = lda_model_grid(df=neg_df, col='trigrams')
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20_trigrams')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20_trigrams')
# Negative responses: trigram single model
p21_topics = lda_one_topic(df=neg_df, col='trigrams', topics=4,
                           savename='negative_trigrams', grid_cols=2)
%%shell
jupyter nbconvert --to html /content/nlp_analytics.ipynb